This analysis uses data and information from:

“Stephan Rasp, Peter D. Dueben, Sebastian Scher, Jonathan A. Weyn, Soukayna Mouatadid, and Nils Thuerey, 2020. WeatherBench: A benchmark dataset for data-driven weather forecasting. arXiv: https://arxiv.org/abs/2002.00469

Code can be found here:

https://github.com/pangeo-data/WeatherBench

library(ncdf4)
## Warning: package 'ncdf4' was built under R version 3.5.2
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.2
nc_data <- nc_open('/Users/henry/Desktop/perso/climate/WeatherBench/data/temperature_850hPa_2016_5.625deg.nc')
# put temperature variable t in a 3d array
t <- ncvar_get(nc_data, "t")
dim(t)
## [1]   64   32 8784
t = as.data.table(t)
setnames(t, c("lon","lat","time","temp"))

Persistence model

Persistence = forecast previous seen temperature.

Since we have one observation per hour and per (latitude, longitude) the persistence model consists in taking the lag of the temperature 72 observations earlier (72 hours) for a 3-day-in-advance forecast.

t[, pred := shift(temp, 72, type="lag"), by=list(lon,lat)]

# RMSE = Root Mean Squared Error
t[!is.na(pred), sqrt(mean((pred - temp)^2))]
## [1] 4.748533
# ------------------------------------------------
# RMSE
# ------------------------------------------------
# 4.80 for 2015
# 4.75 for 2016
# 4.80 for 2017
# 4.72 for 2018
# ------------------------------------------------

Analyzing the error (RMSE) of the Persistence model

t[!is.na(pred), list(error = sqrt(mean((pred - temp)^2))), by=list(time)][plot(time, error, ylim=c(0,10), main="RMSE by time")]

## Null data.table (0 rows and 0 cols)
t[!is.na(pred), list(error = sqrt(mean((pred - temp)^2))), by=list(lat)][plot(lat, error, ylim=c(0,10), main="RMSE by latitude")]

## Null data.table (0 rows and 0 cols)
t[!is.na(pred), list(error = sqrt(mean((pred - temp)^2))), by=list(lon)][plot(lon, error, ylim=c(0,10), main="RMSE by longitude")]

## Null data.table (0 rows and 0 cols)

The RMSE by latitude is clearly the most interesting. Extreme latitudes have highest errors by far.

Details of the error by latitude

e = t[!is.na(pred), list(error = sqrt(mean((pred - temp)^2))), by=list(lat, time)]
# calculate moving average
require(zoo)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
e[, error_mavg := frollmean(error, 24*7)]

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
eg = acast(e, time~lat, value.var="error_mavg")
require(plotly)
## Loading required package: plotly
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(z=eg, type="surface") %>% layout(title = "RMSE by latitude and time")

We can see a clear pattern (seen on 2016, 2017, 2018) of higher errors in mid-year (June-July-August) for low latitudes (latitudes 1-5 in the 1-32 scale). The December-January months have high errors in the high latitudes.

As a next step we will look in more details the actual temperatures and forecasts for these latitudes and months where RMSE are the highest.

e
##         lat time    error error_mavg
##      1:   1   73 4.458356         NA
##      2:   1   74 4.461694         NA
##      3:   1   75 4.447963         NA
##      4:   1   76 4.448137         NA
##      5:   1   77 4.453840         NA
##     ---                             
## 278780:  32 8780 4.154393   5.137988
## 278781:  32 8781 4.100235   5.121372
## 278782:  32 8782 4.028268   5.103114
## 278783:  32 8783 4.000203   5.077278
## 278784:  32 8784 3.880402   5.049697
e[lat==1, plot(time, error_mavg, ylim=c(0,12), col="blue", type="l", main="RMSE by time")]
## NULL
e[lat==1, lines(time, error_mavg, ylim=c(0,12), col="blue", lw=2)]
## NULL
e[lat==2, lines(time, error_mavg, ylim=c(0,12), col="orange", lw=2)]
## NULL
e[lat==3, lines(time, error_mavg, ylim=c(0,12), col="green", lw=2)]
## NULL
e[lat==4, lines(time, error_mavg, ylim=c(0,12), col="brown", lw=2)]

## NULL